#!/usr/bin/python
#coding=utf-8
#filename:getmovie.py

import lxml.html
import urllib2
import re

# Collect the movie detail-page links found on one listing page
def getmovieurl(url):
    """Download a listing page and return its movie link elements.

    The URL is echoed to stdout, the page is fetched with ``urllib2`` and
    parsed with ``lxml.html``.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        The result of the XPath query ``//div[@id='mainleft']/span/a``,
        i.e. the ``<a>`` elements sitting directly inside a ``<span>`` child
        of the ``mainleft`` div.
    """
    print(url)
    page_source = urllib2.urlopen(url).read()
    tree = lxml.html.fromstring(page_source)
    return tree.xpath("//div[@id='mainleft']/span/a")

# Scrape one movie detail page and append its record to the output file
def getmoviedetail(url):
    """Fetch a movie detail page and append one record to the global file ``f``.

    The record is written as
    ``<name>||<date>||<summary text>||<raw summary HTML>|||`` and the file is
    flushed afterwards.  All four fields are extracted before anything is
    written, so a page that cannot be parsed never leaves a truncated record
    (one without the closing ``|||``) in the output.

    Relies on a module-level, writable file object named ``f``; the script
    entry point opens it before calling this function.

    Args:
        url: Address of the movie detail page.

    Raises:
        ValueError: If the title, the date, the summary paragraph or the
            summary HTML block cannot be found in the page.
    """
    print(url)
    html = urllib2.urlopen(url).read()
    doc = lxml.html.fromstring(html)

    def first(items, what):
        # Name the missing piece and the page instead of failing with a bare
        # IndexError / AttributeError.
        if not items:
            raise ValueError("%s not found in %s" % (what, url))
        return items[0]

    # Movie name
    pmname = doc.xpath("//div[@id='mainleft']//span[1]/div[1]/h1")
    mname = first(pmname, "movie name").text_content().encode('utf-8')
    print(mname)

    # Movie date: first "digits-digits-digits" run in the info paragraph
    ptime = doc.xpath("//div[@id='mainleft']/span[1]/div[1]/p")
    date_text = first(ptime, "date paragraph").text_content()
    mtime = first(re.findall(r'\d*-\d*-\d*', date_text), "movie date")

    # Movie summary as plain text
    p = doc.xpath("//div[@id='mainleft']/span[1]/div[2]/p[1]")
    detail1 = first(p, "summary paragraph").text_content().encode('utf-8')

    # Movie summary as raw HTML, taken straight from the downloaded bytes
    mr = re.compile(r'<div class="neiwen">[\s\S]*"wumii-hook">', re.IGNORECASE)
    m = mr.search(html)
    if m is None:
        raise ValueError("summary HTML block not found in %s" % url)
    # The match ends at `"wumii-hook">`; dropping the last 24 characters
    # removes a trailing `<div class="wumii-hook">` (exactly 24 characters),
    # presumably the opening tag of the widget that follows the summary.
    detail2 = m.group()[:-24]

    # Everything parsed successfully: emit the complete record.
    f.write(mname + "||")
    f.write(mtime + "||")
    f.write(detail1 + "||")
    f.write(detail2 + "|||")
    f.flush()
        


if __name__ == '__main__':
    # `f` is intentionally a module-level name: getmoviedetail() writes every
    # record to it.  The with-block guarantees the file is closed even when a
    # download or parse error aborts the crawl part-way through.
    with open('mdata.txt', 'wb') as f:
        # Walk listing pages 1 through 5.
        for page in range(1, 6):
            url = r'http://www.dyxiazai.com/1080p/page/%d' % page
            # Each returned element is a link to one movie detail page.
            for link in getmovieurl(url):
                getmoviedetail(link.get('href'))
